Imports¶

In [1]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from tkinter import *  
from sklearn.preprocessing import LabelEncoder, StandardScaler
import plotly.express as px

Data Preprocessing¶

In [2]:
df=pd.read_csv("spotify_history.csv")
arrShuffle=df["shuffle"]
arrReason_End=df["reason_end"]
arrMs_Played=df["ms_played"]
arrReason_Start=df["reason_start"]

#converting to 0's and 1's
arrReason_End = (df["reason_end"] != "trackdone").astype(int)
In [3]:
odds=[0,0,0,0]
odds[0]=0
odds[1]=0
odds[2]=0
odds[3]=0

onShuffle_Skip = 0
shuffle = 0
notShuffle_notSkip = 0
notShuffle = 0
notShuffle_Skip = 0
onShuffle_notSkip = 0

for x in range(len(arrShuffle)):
    if(arrShuffle[x]==arrShuffle[0] and arrReason_End[x]==0): #if listener was not on shuffle when playing track and they did not finish the song
        odds[0]+=1
        notShuffle_Skip += 1
    elif(arrShuffle[x]==arrShuffle[0] and arrReason_End[x]==1): #if the listener was not on shuffle when playing track and they did finish the song
        odds[1]+=1
        notShuffle_notSkip += 1
    elif(arrShuffle[x]!=arrShuffle[0] and arrReason_End[x]==0): #if the listener was on shuffle while playing the track and they did not finish the song
        odds[2]+=1
        onShuffle_Skip += 1
    else: #if the listener was on shuffle while playing the track and they did finish the song
        odds[3]+=1 
        onShuffle_notSkip += 1


        
odds[0]=odds[0]/len(arrShuffle) #prob of skipping song if listener is not on shuffle/ all shuffle
odds[1]=odds[1]/len(arrShuffle) #prob of finishing song if listener is not on shuffle/ all shuffle
odds[2]=odds[2]/len(arrShuffle) #prob of skipping song if listener is on shuffle/ all shuffle
odds[3]=odds[3]/len(arrShuffle) #prob of finishing song if listener is on shuffle/ all shuffle

plt.plot(range(len(odds)),odds)
plt.xlabel("Index")
plt.ylabel("Percentage")
plt.title("Odds")

plt.show()


for i in range(len(arrShuffle)):
  if arrShuffle[i] != arrShuffle[0]:
    shuffle += 1
  else:
    notShuffle += 1

# P(Skipping a song | We are on shuffle)
p1 = onShuffle_Skip / shuffle

# P(Not skipping | We are not on shuffle)
p2 = notShuffle_notSkip / notShuffle

# P(Skipping a song | We are not on shuffle)
p3 = notShuffle_Skip / notShuffle

# P(Not skipping | We are on shuffle)
p4 = onShuffle_notSkip / shuffle

positions = [0, 1, 2, 3]  # positions for bars
plt.bar([x for x in positions], [p1, p2, p3, p4], color=["blue", "red", "pink", "purple"])

plt.xticks(positions, ["Skipping|Shuffle", "Not Skipping|Not Shuffle", "Skipping|Not Shuffle", "Not Skipping|Shuffle"], fontsize=8, rotation=20)
plt.xlabel("Conditions")
plt.ylabel("Probability")
plt.title("Conditional Probabilities")

plt.show()
No description has been provided for this image
No description has been provided for this image
In [4]:
skippedMS={}
finishedMS={}

for x in range(len(arrMs_Played)):
    if(arrReason_End[x]==1):
        if(arrMs_Played[x]//31222.5 not in finishedMS):
            finishedMS.update({arrMs_Played[x]//31222.5:1})
        else:
            finishedMS[arrMs_Played[x]//31222.5]+=1
    else:
        if(arrMs_Played[x]//31222.5 not in skippedMS):
            skippedMS.update({arrMs_Played[x]//31222.5:1})
        else:
            skippedMS[arrMs_Played[x]//31222.5]+=1

arr1=list(skippedMS.keys())
arr1=sorted(arr1)
arr2=list(finishedMS.keys())
arr2=sorted(arr2)
arr3={}
arr4={}

for x in arr1:
    if x in arr2:
        arr3.update({x:skippedMS[x]/(skippedMS[x]+finishedMS[x])})
    else:
        arr3.update({x:1})

for x in arr2:
    if x in arr1:
        arr4.update({x:finishedMS[x]/(skippedMS[x]+finishedMS[x])})
    else:
        arr4.update({x:1})



plt.plot(list(arr3.keys()),list(arr3.values()))
plt.xlabel("Keys")
plt.ylabel("Percentages")
plt.title("Skipping odds")

plt.show()

plt.plot(list(arr4.keys()),list(arr4.values()))
plt.title("Finishing odds")
plt.show()
No description has been provided for this image
No description has been provided for this image

3-D Visualizations on Data Set¶

In [5]:
def get_seconds_range(secs):
    return float(secs)/1000


df = pd.read_csv("spotify_history.csv")
df["reason_end"] = (df["reason_end"] != "trackdone").astype(int)
df['reason_start']=df['reason_start'].apply(str)
df['shuffle']=df['shuffle'].apply(str)
df['ms_played']=df['ms_played'].apply(int)
df['secs_played'] = df['ms_played'].apply(get_seconds_range)

Xuniques, df["X"] = np.unique(df['reason_start'], return_inverse=True)
Yuniques, df["Y"] = np.unique(df['shuffle'], return_inverse= True)
df["Z"] = np.array(df['ms_played'])

fig = px.scatter_3d(df, x='reason_start', y='shuffle', z='secs_played',
                    color="reason_end",
                    hover_data=['X', 'Y', 'Z', 'reason_end'])
fig.show()
In [6]:
def get_seconds_range(secs):
    secs_played = float(secs)/1000
    if secs_played <= 10:
        return "0-10"
    elif secs_played <= 20 and secs_played > 10:
        return "11-20"
    elif secs_played <= 30 and secs_played > 20:
        return "21-30"
    else:
        return "30-end"





def displayGraph(colName):
    df=pd.read_csv("spotify_history.csv")

    df["seconds_played_range"] = df["ms_played"].apply(get_seconds_range)
    # df = df.drop(columns=['ms_played'], axis=1)

    df["reason_end"] = (df["reason_end"] != "trackdone").astype(int)
    for x in range(len(df["reason_end"])):
        if df["reason_end"][x]==1:
            df["reason_end"][x]="Played"
        else:
            df["reason_end"][x]="Skipped"

    # Create a crosstab of counts
    crosstab = pd.crosstab(df[colName], df['reason_end'])

    # Convert counts to percentages row-wise
    percentage = crosstab.div(crosstab.sum(axis=1), axis=0) * 100

    # Plot stacked bar chart
    percentage.plot(kind='bar', stacked=True, figsize=(10, 6), colormap='tab20c')
    plt.title('Percentage Breakdown of reason_end by '+colName)
    plt.xlabel('Reason Start')
    plt.ylabel('Percentage (%)')
    plt.legend(title='Reason End', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()


categories=["reason_start","seconds_played_range","shuffle"]
for x in categories:
    displayGraph(x)
/var/folders/yn/1sl94ttd7sz7kt82dfnzr0p00000gn/T/ipykernel_11349/287600621.py:25: FutureWarning:

ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


/var/folders/yn/1sl94ttd7sz7kt82dfnzr0p00000gn/T/ipykernel_11349/287600621.py:25: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/var/folders/yn/1sl94ttd7sz7kt82dfnzr0p00000gn/T/ipykernel_11349/287600621.py:25: FutureWarning:

Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'Played' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.

No description has been provided for this image
/var/folders/yn/1sl94ttd7sz7kt82dfnzr0p00000gn/T/ipykernel_11349/287600621.py:25: FutureWarning:

ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


/var/folders/yn/1sl94ttd7sz7kt82dfnzr0p00000gn/T/ipykernel_11349/287600621.py:25: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/var/folders/yn/1sl94ttd7sz7kt82dfnzr0p00000gn/T/ipykernel_11349/287600621.py:25: FutureWarning:

Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'Played' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.

No description has been provided for this image
/var/folders/yn/1sl94ttd7sz7kt82dfnzr0p00000gn/T/ipykernel_11349/287600621.py:25: FutureWarning:

ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


/var/folders/yn/1sl94ttd7sz7kt82dfnzr0p00000gn/T/ipykernel_11349/287600621.py:25: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/var/folders/yn/1sl94ttd7sz7kt82dfnzr0p00000gn/T/ipykernel_11349/287600621.py:25: FutureWarning:

Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'Played' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.

No description has been provided for this image

Model Training - Predicting Skipped¶

In [7]:
#training model
data = pd.read_csv("spotify_history.csv")
data = data.drop(columns=['ts'], axis=1)
data = data.dropna()

data.to_csv("data.csv", index=False)

#converting to 0's and 1's
data['reason_end'] = (data["reason_end"] != "trackdone").astype(int)

X_train, X_test, Y_train, Y_test = train_test_split(
    data.drop(['reason_end'], axis=1),
    data['reason_end'],
    test_size=1/4.0,
    random_state=42
)

def calc_weights(set):
    counts = Counter(set)
    samples = len(set)

    weight = {item: samples / (c * len(counts)) for item, c in counts.items()}
    return weight

weights = calc_weights(Y_train)

categorical = X_train.select_dtypes(include=['object']).columns
numerical = X_train.select_dtypes(include=['number']).columns

print(categorical)
print(numerical)

cat = Pipeline([
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

num = Pipeline([
    ('simple', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

col_trans = ColumnTransformer([
    ('numerical', num, numerical),
    ('categorical', cat, categorical),
])

model = Pipeline(steps=[
    ('col', col_trans),
    ('svm', LinearSVC(
        max_iter=1000,
        penalty="l2",
        class_weight=weights,
        C=0.1,
    ))
])

X_train.to_csv("x_train.csv", index=False)
Y_train.to_csv("y_train.csv", index=False)

model.fit(X_train, Y_train)
Index(['spotify_track_uri', 'platform', 'track_name', 'artist_name',
       'album_name', 'reason_start'],
      dtype='object')
Index(['ms_played'], dtype='object')
Out[7]:
Pipeline(steps=[('col',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('simple',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['ms_played'], dtype='object')),
                                                 ('categorical',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['spotify_track_uri', 'platform', 'track_name', 'artist_name',
       'album_name', 'reason_start'],
      dtype='object'))])),
                ('svm',
                 LinearSVC(C=0.1,
                           class_weight={0: 0.9719758902591105,
                                         1: 1.0296880733944953}))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('col',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('simple',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['ms_played'], dtype='object')),
                                                 ('categorical',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['spotify_track_uri', 'platform', 'track_name', 'artist_name',
       'album_name', 'reason_start'],
      dtype='object'))])),
                ('svm',
                 LinearSVC(C=0.1,
                           class_weight={0: 0.9719758902591105,
                                         1: 1.0296880733944953}))])
ColumnTransformer(transformers=[('numerical',
                                 Pipeline(steps=[('simple',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 Index(['ms_played'], dtype='object')),
                                ('categorical',
                                 Pipeline(steps=[('one_hot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 Index(['spotify_track_uri', 'platform', 'track_name', 'artist_name',
       'album_name', 'reason_start'],
      dtype='object'))])
Index(['ms_played'], dtype='object')
SimpleImputer(strategy='median')
StandardScaler()
Index(['spotify_track_uri', 'platform', 'track_name', 'artist_name',
       'album_name', 'reason_start'],
      dtype='object')
OneHotEncoder(handle_unknown='ignore')
LinearSVC(C=0.1, class_weight={0: 0.9719758902591105, 1: 1.0296880733944953})

Accuracy on Skipping¶

In [8]:
Y_train_pred = model.predict(X_train)
print("Accuracy on training set = ", accuracy_score(Y_train, Y_train_pred))
Y_test_pred = model.predict(X_test)
print("Accuracy on testing set = ", accuracy_score(Y_test, Y_test_pred))
print(classification_report(Y_test, Y_test_pred))

# Plot a confusion matrix -- commented out because it would take a while to run sometimes
cm = confusion_matrix(Y_test, Y_test_pred)
sns.heatmap(cm, annot=True)
plt.title('Confusion matrix of the classifier')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
Accuracy on training set =  0.9633540040628675
Accuracy on testing set =  0.9556024804875441
              precision    recall  f1-score   support

           0       0.93      0.98      0.96     19374
           1       0.98      0.93      0.95     18038

    accuracy                           0.96     37412
   macro avg       0.96      0.95      0.96     37412
weighted avg       0.96      0.96      0.96     37412

No description has been provided for this image

Model Training - Seconds Played¶

In [9]:
def get_seconds_range(secs):
    secs_played = float(secs)/1000
    if secs_played <= 10:
        return "0-10"
    elif secs_played <= 20 and secs_played > 10:
        return "11-20"
    elif secs_played <= 30 and secs_played > 20:
        return "21-30"
    else:
        return "30-end"



#training model
dataTwo = pd.read_csv("spotify_history.csv")
dataTwo = dataTwo.drop(columns=['ts'], axis=1)
dataTwo = dataTwo.dropna()

#extracting feature
dataTwo["seconds_played_range"] = dataTwo["ms_played"].apply(get_seconds_range)
# dataTwo = dataTwo.drop(columns=['ms_played'], axis=1)



X_train, X_test, Y_train, Y_test = train_test_split(
    dataTwo.drop(['seconds_played_range'], axis=1),
    dataTwo['seconds_played_range'],
    test_size=1/4.0,
    random_state=42
)

def calc_weights(set):
    counts = Counter(set)
    samples = len(set)

    weight = {item: samples / (c * len(counts)) for item, c in counts.items()}
    return weight

weights = calc_weights(Y_train)

categorical = X_train.select_dtypes(include=['object']).columns
numerical = X_train.select_dtypes(include=['number']).columns

print(categorical)
print(numerical)

cat = Pipeline([
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

num = Pipeline([
    ('simple', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

col_trans = ColumnTransformer([
    ('numerical', num, numerical),
    ('categorical', cat, categorical),
])

modelTwo = Pipeline(steps=[
    ('col', col_trans),
    ('svm', LinearSVC(
        max_iter=1000,
        penalty="l2",
        class_weight=weights,
        C=0.1,
    ))
])

# X_train.to_csv("x_train.csv", index=False)
Y_train.to_csv("y2_train.csv", index=False)

modelTwo.fit(X_train, Y_train)
Index(['spotify_track_uri', 'platform', 'track_name', 'artist_name',
       'album_name', 'reason_start', 'reason_end'],
      dtype='object')
Index(['ms_played'], dtype='object')
Out[9]:
Pipeline(steps=[('col',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('simple',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['ms_played'], dtype='object')),
                                                 ('categorical',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['spotify_track_uri', 'platform', 'track_name', 'artist_name',
       'album_name', 'reason_start', 'reason_end'],
      dtype='object'))])),
                ('svm',
                 LinearSVC(C=0.1,
                           class_weight={'0-10': 0.764654585093337,
                                         '11-20': 9.039626288659793,
                                         '21-30': 14.45595054095827,
                                         '30-end': 0.39802258284157965}))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('col',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('simple',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['ms_played'], dtype='object')),
                                                 ('categorical',
                                                  Pipeline(steps=[('one_hot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['spotify_track_uri', 'platform', 'track_name', 'artist_name',
       'album_name', 'reason_start', 'reason_end'],
      dtype='object'))])),
                ('svm',
                 LinearSVC(C=0.1,
                           class_weight={'0-10': 0.764654585093337,
                                         '11-20': 9.039626288659793,
                                         '21-30': 14.45595054095827,
                                         '30-end': 0.39802258284157965}))])
ColumnTransformer(transformers=[('numerical',
                                 Pipeline(steps=[('simple',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 Index(['ms_played'], dtype='object')),
                                ('categorical',
                                 Pipeline(steps=[('one_hot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 Index(['spotify_track_uri', 'platform', 'track_name', 'artist_name',
       'album_name', 'reason_start', 'reason_end'],
      dtype='object'))])
Index(['ms_played'], dtype='object')
SimpleImputer(strategy='median')
StandardScaler()
Index(['spotify_track_uri', 'platform', 'track_name', 'artist_name',
       'album_name', 'reason_start', 'reason_end'],
      dtype='object')
OneHotEncoder(handle_unknown='ignore')
LinearSVC(C=0.1,
          class_weight={'0-10': 0.764654585093337, '11-20': 9.039626288659793,
                        '21-30': 14.45595054095827,
                        '30-end': 0.39802258284157965})

Accuracy on Range of Seconds Played¶

In [10]:
Y_train_pred = modelTwo.predict(X_train)
print("Accuracy on training set = ", accuracy_score(Y_train, Y_train_pred))
Y_test_pred = modelTwo.predict(X_test)
print("Accuracy on testing set = ", accuracy_score(Y_test, Y_test_pred))
print(classification_report(Y_test, Y_test_pred))

# Plot a confusion matrix -- commented out because it would take a while to run sometimes
cm = confusion_matrix(Y_test, Y_test_pred)
sns.heatmap(cm, annot=True)
plt.title('Confusion matrix of the classifier')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
Accuracy on training set =  0.9798549485013721
Accuracy on testing set =  0.9562172564952421
              precision    recall  f1-score   support

        0-10       0.97      0.97      0.97     12219
       11-20       0.40      0.38      0.39      1022
       21-30       0.30      0.34      0.32       632
      30-end       0.99      0.99      0.99     23539

    accuracy                           0.96     37412
   macro avg       0.66      0.67      0.67     37412
weighted avg       0.96      0.96      0.96     37412

No description has been provided for this image

3-D Visualization on Testing Results¶

In [11]:
def get_seconds_range(secs):
    return float(secs)/1000


df = X_test
df["reason_end"] = Y_test
df['reason_start']=df['reason_start'].apply(str)
df['shuffle']=df['shuffle'].apply(str)
df['ms_played']=df['ms_played'].apply(int)
df['secs_played'] = df['ms_played'].apply(get_seconds_range)

Xuniques, df["X"] = np.unique(df['reason_start'], return_inverse=True)
Yuniques, df["Y"] = np.unique(df['shuffle'], return_inverse= True)
df["Z"] = np.array(df['ms_played'])

fig = px.scatter_3d(df, x='reason_start', y='shuffle', z='secs_played',
                    color="reason_end",
                    hover_data=['X', 'Y', 'Z', 'reason_end'])
fig.show()